############################################################ ########### #
#
# Project: Stability and change in the opinion-policy relationship
# 
# This script combines survey data with smoothers and cleans census data
# 
# 2022.09.30. 
############################################################ ########### #

library(rio)
library(here)
library(tidyr)
library(dplyr)


# Import survey data ---------------------------------------------------------

# The 2021 wave is stored as two separate files.
a21 <- rio::import("A-original-data/2021a.rds")
b21 <- rio::import("A-original-data/2021b.rds")

# State-level predictors (aka smoothers) that get merged onto the survey rows.
smoother <- rio::import("B-analysis-data/final_rentvote.rds")

# Stack the two 2021 files, lower-case the state names and shift racecat
# down by one.
df21 <- a21 %>%
  bind_rows(b21) %>%
  mutate(
    # id = as.numeric(paste0("2021000", 1:n())),
    racecat = racecat - 1,
    state = as.character(tolower(state))
  ) %>%
  glimpse()

# The earlier waves arrive already merged in one file. Keep only the analysis
# variables, make `response` numeric, lower-case the state names and recode
# the year 2013 to 2014.
df13 <- rio::import("A-original-data/cleaned_merged.rds") %>%
  select(item, response, state, age, female, edu, racecat, year, stateyear) %>%
  mutate(
    year = ifelse(year == 2013, 2014, year),
    state = as.character(tolower(state)),
    response = as.numeric(response)
  )

# Stack the earlier waves on top of 2021, add a rescaled year
# ((year - 2017.5) / 7) and attach the smoothers. No `by` is given, so the
# join uses every column name the two tables share.
df <- df13 %>%
  bind_rows(df21) %>%
  mutate(year_std = (year - 2017.5) / 7) %>%
  left_join(smoother) %>%
  glimpse()

# Export the cleaned, merged survey data.
saveRDS(df, file = "B-analysis-data/survey_merged.rds")



# Data quality checks -----------------------------------------------------

# NOTE(review): these checks run after survey_merged.rds has already been
# written, so a failure here does not stop the export.

# Respondent ids from the two sources must not collide once stacked: the
# number of distinct ids in the merged data has to equal the sum of the
# distinct ids in each source. This script itself never creates an `id`
# column, so when none is present the comparison would be 0 + 0 == 0 and pass
# without testing anything; warn in that case instead of passing silently.
if ("id" %in% names(df)) {
  stopifnot(
    length(unique(df13[["id"]])) + length(unique(df21[["id"]])) ==
      length(unique(df[["id"]]))
  )
} else {
  warning(
    "No `id` column in the merged survey data; respondent id check skipped.",
    call. = FALSE
  )
}

# Make sure state names are identical across the smoother and both survey
# sources. Each helper table holds one row per state, sorted by state.
state_s <- smoother %>%
  distinct(state) %>%
  arrange(state)

state21 <- df21 %>%
  distinct(state) %>%
  arrange(state)

state13 <- df13 %>%
  distinct(state) %>%
  arrange(state)

# Compare sorted, de-duplicated, lower-cased character vectors so the result
# does not depend on row order in the input files, on letter case, or on
# whether a column is stored as factor or character.
state_key <- function(x) {
  sort(unique(tolower(as.character(x))))
}

stopifnot(identical(state_key(state_s$state), state_key(state21$state)))
stopifnot(identical(state_key(state13$state), state_key(state21$state)))


# Import census ------------------------------------------------------------------

# Read the raw census table, rename `race` to `racecat` to match the survey
# data, and add a constant `item` column set to "state".
census_raw <- rio::import("B-analysis-data/census_13-19_raw.rds") %>%
  rename(racecat = race) %>%
  mutate(item = "state") %>%
  glimpse()

# The set of (lower-cased) census state names must match the survey's exactly.
stopifnot(
  identical(
    sort(tolower(unique(census_raw$state))),
    sort(unique(df$state))
  )
)

# Re-label the census years: 2019 becomes 2021, every other year is moved
# forward by one. year_std uses the same rescaling as the survey data.
# stateyear is built before `state` is lower-cased, so it keeps the census
# file's capitalisation -- NOTE(review): confirm this matches the survey's
# stateyear values.
# statefreq is the sum of freq within state-year; natfreq is the sum within year.
census <- census_raw %>%
  mutate(
    year = case_when(
      year == 2019 ~ 2021,
      TRUE ~ year + 1
    ),
    year_std = (year - 2017.5) / 7
  ) %>%
  mutate(
    stateyear = paste(state, year),
    state = tolower(state)
  ) %>%
  group_by(state, year) %>%
  mutate(statefreq = sum(freq)) %>%
  group_by(year) %>% # replaces the state-year grouping
  mutate(natfreq = sum(freq)) %>%
  ungroup() %>%
  left_join(smoother) %>% # no `by`: joins on all shared column names
  glimpse()

saveRDS(census, file = "B-analysis-data/census.rds")


# Demographics ------------------------------------------------------------

# Share of each category in percent, to one decimal place. table() drops
# missing values by default. deparse.level = 0 leaves the table's dimension
# name blank, exactly as when table() is called directly on `data$column`.
pct_share <- function(x) {
  round(prop.table(table(x, deparse.level = 0)), 3) * 100
}

# 2021 survey, first file
pct_share(a21$age)
pct_share(a21$racecat)
pct_share(a21$edu)
pct_share(a21$female)

# 2021 survey, second file
pct_share(b21$age)
pct_share(b21$racecat)
pct_share(b21$edu)
pct_share(b21$female)

# All survey waves combined
pct_share(df$age)
pct_share(df$racecat)
pct_share(df$edu)
pct_share(df$female)